File set-up

Set working directory to current directory

# When run inside RStudio, set the working directory to the folder containing
# this script so the relative ../data paths below resolve correctly.
# Outside RStudio the working directory is left untouched.
if (rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

Load standard libraries and resolve conflicts

# Attach the tidyverse meta-package (dplyr, tidyr, readr, ggplot2, ...)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Resolve name masking explicitly in favour of the dplyr verbs used throughout
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
conflict_prefer("slice", "dplyr")
## [conflicted] Will prefer dplyr::slice over any other package.
conflict_prefer("rename", "dplyr")
## [conflicted] Will prefer dplyr::rename over any other package.
conflict_prefer('intersect', 'dplyr')
## [conflicted] Will prefer dplyr::intersect over any other package.

Read data

# Selected circRNAs with primer design and validation results
# (one row per circRNA/tool/cell line combination)
cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')
## Rows: 1560 Columns: 48
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (27): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Full set of circRNAs detected by any tool in any cell line
all_circ = read_tsv("../data/Supplementary_Table_2_all_circRNAs.txt")
## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl  (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-tool precision values
val = read_tsv('../data/Supplementary_Table_6A_precision_values.txt')
## Rows: 29 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): tool, count_group
## dbl (16): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the loaded tables
cq
all_circ

Precision

Presence in db

make count table

# Contingency table: database presence (novel vs in_db) x validation outcome,
# for high-abundance circRNAs with a compound validation call.
# NOTE(review): unlike the later sections, this chunk does not call unique()
# before counting, so circRNAs selected by several tools are counted once per
# tool — confirm this is intentional.
cont_table <- cq %>%
  select(circ_id, cell_line, compound_val, count_group_median, n_db) %>%
  # restrict to the high-abundance group
  filter(count_group_median == "count ≥ 5") %>%
  # drop circRNAs without an amplicon-sequencing validation call
  filter(!is.na(compound_val)) %>%
  # recode number of databases into a binary indicator (0 = novel, 1 = in db)
  mutate(n_db = ifelse(is.na(n_db), 0, 1)) %>%
  group_by(n_db) %>%
  count(compound_val) %>%
  pivot_wider(names_from = compound_val, values_from = n) %>%
  ungroup() %>%
  select(-n_db)

cont_table <- as.data.frame(cont_table)
# n_db sorts 0 (novel) before 1 (in_db)
rownames(cont_table) <- c("novel", "in_db")

cont_table

plot data

# Visualize the contingency table as a mosaic plot
mosaicplot(cont_table,
           color = TRUE,
           main = "Mosaic plot")

If any expected count in the chi-squared test is < 5, Fisher's exact test should be used instead

# Expected cell counts — all are >= 5 here, so the chi-squared test is valid
chisq.test(cont_table)$expected
##             fail      pass
## novel   9.871795  67.12821
## in_db 110.128205 748.87179

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 337.48, df = 1, p-value < 2.2e-16

OR

# Odds ratio of validation (pass vs fail) for in_db relative to novel circRNAs
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR
## [1] 57.08276

Canonical splicing

# Contingency table: canonical (AGNGT) vs non-canonical back-splice motif
# x validation outcome, for high-abundance circRNAs with a known strand.
cont_table <- cq %>%
  filter(count_group_median == 'count ≥ 5',
         !strand == 'unknown') %>%
  select(circ_id, cell_line, compound_val, ss_motif) %>%
  unique() %>%
  # binary indicator: 1 = canonical AGNGT splice-site motif, 0 = other motif
  mutate(ss_can = ifelse(ss_motif == "AGNGT", 1, 0)) %>%
  # drop circRNAs without a compound validation call
  filter(!is.na(compound_val)) %>%
  group_by(ss_can) %>%
  count(compound_val) %>%
  pivot_wider(values_from = n, names_from = compound_val) %>%
  ungroup() %>%
  select(-ss_can)


cont_table <- as.data.frame(cont_table)
# ss_can sorts 0 (non-canonical) before 1 (canonical);
# row labels: fixed the "cannonical" misspelling
rownames(cont_table) <- c("non_canonical", "canonical")

cont_table

plot data

# Visualize the contingency table as a mosaic plot
mosaicplot(cont_table,
           color = TRUE,
           main = "Mosaic plot")

If any expected count in the chi-squared test is < 5, Fisher's exact test should be used instead

# NOTE(review): one expected count (4.09) is below 5, which by the rule stated
# above means fisher.test(cont_table) would be the more appropriate test here —
# the warning below flags the same issue. Confirm which test should be reported.
chisq.test(cont_table)$expected
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
##                     fail      pass
## non_cannonical  4.090909  27.90909
## cannonical     85.909091 586.09091

chisquare test

chisq.test(cont_table)
## Warning in chisq.test(cont_table): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 134.58, df = 1, p-value < 2.2e-16

OR

# Odds ratio of validation (pass vs fail) for canonical relative to
# non-canonical splice-site circRNAs
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR
## [1] 41.16667

Known annotation

# Contingency table: known annotation (ENST transcript match vs no match)
# x validation outcome, for high-abundance circRNAs.
cont_table <- cq %>%
  filter(count_group_median == "count ≥ 5") %>%
  select(circ_id, cell_line, compound_val, ENST) %>%
  unique() %>%
  # drop circRNAs without a compound validation call
  filter(!is.na(compound_val)) %>%
  # binary indicator: does the circRNA match a known ENST transcript?
  mutate(ENST_group = ifelse(is.na(ENST), 'no_match', "match")) %>%
  group_by(ENST_group) %>%
  count(compound_val) %>%
  pivot_wider(names_from = compound_val, values_from = n) %>%
  ungroup() %>%
  select(-ENST_group)

cont_table <- as.data.frame(cont_table)
# ENST_group sorts "match" before "no_match"
rownames(cont_table) <- c("match", "no_match")

cont_table

plot data

# Visualize the contingency table as a mosaic plot
mosaicplot(cont_table,
           color = TRUE,
           main = "Mosaic plot")

If any expected count in the chi-squared test is < 5, Fisher's exact test should be used instead

# Expected cell counts — all are >= 5 here, so the chi-squared test is valid
chisq.test(cont_table)$expected
##               fail      pass
## match    106.79143 689.20857
## no_match  12.20857  78.79143

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 179.73, df = 1, p-value < 2.2e-16

OR

# Odds ratio of validation (pass vs fail) for annotation-matched relative to
# unmatched circRNAs (row 1 = match, row 2 = no_match)
OR = (cont_table[1,2]/cont_table[1,1])/(cont_table[2,2]/cont_table[2,1])
OR
## [1] 16.41331

CircRNA detection tool approach

# Contingency table: detection approach (candidate-based vs segmented
# read-based) x validation outcome, for high-abundance circRNAs.
# Integrative tools are excluded so the two approaches can be compared.
cont_table <- cq %>%
  filter(count_group_median == "count ≥ 5") %>%
  # attach the per-tool approach annotation
  left_join(read_tsv('../data/details/tool_annotation.txt')) %>%
  select(circ_id, cell_line, compound_val, approach) %>%
  filter(approach != 'integrative') %>%
  unique() %>%
  # drop circRNAs without a compound validation call
  filter(!is.na(compound_val)) %>%
  group_by(approach) %>%
  count(compound_val) %>%
  pivot_wider(names_from = compound_val, values_from = n) %>%
  ungroup() %>%
  select(-approach)
## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(tool)`
cont_table <- as.data.frame(cont_table)
# approaches sort alphabetically
rownames(cont_table) <- c("candidate-based", "segmented read-based")

cont_table

plot data

# Visualize the contingency table as a mosaic plot
mosaicplot(cont_table,
           color = TRUE,
           main = "Mosaic plot")

If any expected count in the chi-squared test is < 5, Fisher's exact test should be used instead

# Expected cell counts — all are >= 5 here, so the chi-squared test is valid
chisq.test(cont_table)$expected
##                          fail     pass
## candidate-based      27.67367 165.3263
## segmented read-based 88.32633 527.6737

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 8.2103, df = 1, p-value = 0.004165

OR

# Odds ratio of validation (pass vs fail) for candidate-based relative to
# segmented read-based tools (row 1 = candidate-based)
OR = (cont_table[1,2]/cont_table[1,1])/(cont_table[2,2]/cont_table[2,1])
OR
## [1] 2.327249

BSJ count group

# Contingency table: BSJ count group (low vs high abundance) x validation
# outcome, across all selected circRNAs.
cont_table <- cq %>%
  select(circ_id, cell_line, compound_val, count_group_median) %>%
  unique() %>%
  # drop circRNAs without a compound validation call
  filter(!is.na(compound_val)) %>%
  group_by(count_group_median) %>%
  count(compound_val) %>%
  pivot_wider(names_from = compound_val, values_from = n) %>%
  ungroup() %>%
  select(-count_group_median)

cont_table <- as.data.frame(cont_table)
# count groups sort "count < 5" before "count ≥ 5"
rownames(cont_table) <- c("count < 5", "count ≥ 5")

cont_table

plot data

# Visualize the contingency table as a mosaic plot
mosaicplot(cont_table,
           color = TRUE,
           main = "Mosaic plot")

If any expected count in the chi-squared test is < 5, Fisher's exact test should be used instead

# Expected cell counts — all are >= 5 here, so the chi-squared test is valid
chisq.test(cont_table)$expected
##                fail     pass
## count < 5  53.64444 230.3556
## count ≥ 5 167.35556 718.6444

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 69.504, df = 1, p-value < 2.2e-16

OR

# Odds ratio of validation (pass vs fail) for high-abundance relative to
# low-abundance circRNAs (row 2 = count ≥ 5)
OR = (cont_table[2,2]/cont_table[2,1])/(cont_table[1,2]/cont_table[1,1])
OR
## [1] 3.612245

Sensitivity

# Per-tool annotation: detection approach, linear annotation, strand handling,
# splicing assumptions, BSJ filtering
tool_anno = read_tsv('../data/details/tool_annotation.txt')
## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-tool sensitivity values
sens = read_tsv('../data/Supplementary_Table_6B_sensitivity_values.txt')
## Rows: 32 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group_median
## dbl (6): nr_detected, nr_expected, sensitivity, perc_compound_val, total_n, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

add annotation to sensitivity

# Join the tool annotation onto the sensitivity table and keep only the
# high-abundance count group.
sens_anno <- sens %>%
  left_join(tool_anno) %>%
  select(-tool_lt) %>%
  ungroup() %>%
  filter(count_group_median == 'count ≥ 5')
## Joining with `by = join_by(tool)`
sens_anno
# number of unique circRNA/cell line pairs per compound validation outcome
cq %>% select(circ_id, compound_val, cell_line) %>% unique() %>% count(compound_val)

Detection approach: segmented read-based VS candidate-based

# Wilcoxon rank-sum: sensitivity of segmented read-based vs candidate-based
# tools (integrative tools excluded from the two-group comparison)
wilcox.test(sensitivity ~ approach, data=sens_anno %>% filter(!approach == 'integrative')) 
## 
##  Wilcoxon rank sum exact test
## 
## data:  sensitivity by approach
## W = 11, p-value = 0.456
## alternative hypothesis: true location shift is not equal to 0
# Kruskal-Wallis: sensitivity across all three approaches (incl. integrative)
kruskal.test(sensitivity ~ approach, data=sens_anno)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  sensitivity by approach
## Kruskal-Wallis chi-squared = 0.63636, df = 2, p-value = 0.7275

Detection based on annotation: known splice sites VS entire genome

# Wilcoxon rank-sum: sensitivity of tools using known splice-site annotation
# vs tools searching the entire genome
wilcox.test(sensitivity ~ lin_annotation, data=sens_anno)
## 
##  Wilcoxon rank sum exact test
## 
## data:  sensitivity by lin_annotation
## W = 19, p-value = 1
## alternative hypothesis: true location shift is not equal to 0

Strand annotation method

# Kruskal-Wallis: sensitivity across strand-annotation methods, excluding
# tools that report no strand at all
kruskal.test(sensitivity ~ strand_anno, data=sens_anno %>% 
               filter(!strand_anno == "no strand reported"))
## 
##  Kruskal-Wallis rank sum test
## 
## data:  sensitivity by strand_anno
## Kruskal-Wallis chi-squared = 3.4962, df = 3, p-value = 0.3213

Splicing: canonical VS non-canonical

# Wilcoxon rank-sum: sensitivity of canonical-only vs non-canonical splicing
# tools; the result object is reused for the effect-size calculation below
wilcox_ss = wilcox.test(sensitivity ~ splicing, data=sens_anno) 

wilcox_ss
## 
##  Wilcoxon rank sum exact test
## 
## data:  sensitivity by splicing
## W = 47, p-value = 0.02747
## alternative hypothesis: true location shift is not equal to 0
# mean sensitivity per splicing category
sens_anno %>%
  group_by(splicing) %>%
  summarise(mean_sens = mean(sensitivity))

method 1

# Effect size for the Wilcoxon test, method 1: r = |Z| / sqrt(N),
# with Z recovered from the two-sided p-value
N = nrow(sens_anno)
N
## [1] 16
# standard-normal quantile of half the two-sided p-value gives (negative) Z
Z = qnorm(wilcox_ss$p.value/2)
Z
## [1] -2.204737
r = abs(Z)/sqrt(N)
r
## [1] 0.5511843

method 2

# Effect size, method 2: rstatix implementation of the same Wilcoxon test
library(rstatix)
library(ggpubr)
sens_anno %>% rstatix::wilcox_test(sensitivity ~ splicing)
sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing)
# bootstrapped CI version (slow), kept for reference:
#sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing, ci = T)

Median difference in sensitivity

# All pairwise sensitivity differences between canonical-splicing tools and
# non-canonical-splicing tools. Built as a cartesian product instead of
# growing a tibble with bind_rows() inside nested loops (same rows, same order,
# no quadratic copying).
canonical <- sens_anno %>%
  filter(splicing == "canonical") %>%
  select(tool_can = tool, sens_can = sensitivity)

non_canonical <- sens_anno %>%
  filter(splicing == "non-canonical") %>%
  select(tool_non_can = tool, sens_non_can = sensitivity)

# expand_grid() varies the second frame fastest, matching the original loop
# order (outer: canonical tool, inner: non-canonical tool)
median_diff <- expand_grid(canonical, non_canonical) %>%
  mutate(sens_diff = sens_can - sens_non_can)

median_diff
# median pairwise difference in sensitivity
median_diff %>% pull(sens_diff) %>% median()
## [1] 0.1603651

Filtering

# Wilcoxon rank-sum: sensitivity of tools with vs without a built-in BSJ filter
wilcox.test(sensitivity ~ BSJ_filter, data=sens_anno) 
## 
##  Wilcoxon rank sum exact test
## 
## data:  sensitivity by BSJ_filter
## W = 27, p-value = 1
## alternative hypothesis: true location shift is not equal to 0

Correlation sensitivity and theoretical nr of TPs (extrapolated sensitivity)

below 5

# Spearman correlation between measured and extrapolated sensitivity for the
# low-abundance group; Sailfish-cir is excluded (see RNase R section: it
# reports no counts)
sens_tmp = sens %>% filter(count_group_median == "count < 5",
                           !tool == "Sailfish-cir")
cor.test(sens_tmp$sensitivity, sens_tmp$extrapolated_sensitivity, method = 'spearman')
## Warning in cor.test.default(sens_tmp$sensitivity,
## sens_tmp$extrapolated_sensitivity, : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  sens_tmp$sensitivity and sens_tmp$extrapolated_sensitivity
## S = 62.17, p-value = 0.0004567
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8292042

above 5

# Same correlation for the high-abundance group
sens_tmp = sens %>% filter(count_group_median == "count ≥ 5",
                           !tool == "Sailfish-cir")
cor.test(sens_tmp$sensitivity, sens_tmp$extrapolated_sensitivity, method = 'spearman')
## 
##  Spearman's rank correlation rho
## 
## data:  sens_tmp$sensitivity and sens_tmp$extrapolated_sensitivity
## S = 124, p-value = 0.000988
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7785714

Cell lines - statistics

Chi-squared

make count table

# Contingency table: cell line x validation outcome for high-abundance
# circRNAs (all validation methods combined via compound_val).
cont_table <- cq %>%
  select(circ_id, cell_line, compound_val, count_group_median) %>%
  unique() %>%
  # restrict to the high-abundance group
  filter(count_group_median == 'count ≥ 5') %>%
  # drop circRNAs without a compound validation call
  filter(!is.na(compound_val)) %>%
  group_by(cell_line) %>%
  count(compound_val) %>%
  pivot_wider(names_from = compound_val, values_from = n) %>%
  ungroup() %>%
  select(-cell_line)

cont_table <- as.data.frame(cont_table)
# cell lines sort alphabetically: HLF, NCI-H23, SW480
rownames(cont_table) <- c("HLF", "NCI-H23", "SW480")

cont_table

plot data

# Visualize the contingency table as a mosaic plot
mosaicplot(cont_table,
           color = TRUE,
           main = "Mosaic plot")

If any expected count in the chi-squared test is < 5, Fisher's exact test should be used instead

# Expected cell counts — all are >= 5 here, so the chi-squared test is valid
chisq.test(cont_table)$expected
##             fail     pass
## HLF     47.14334 303.8567
## NCI-H23 38.01016 244.9898
## SW480   33.84650 218.1535

chisquare test

# 3x2 table, so no continuity correction is applied
chisq.test(cont_table)
## 
##  Pearson's Chi-squared test
## 
## data:  cont_table
## X-squared = 23.691, df = 2, p-value = 7.171e-06

ANOVA - Sensitivity

first calculate the total nr of validated circ per count group

# Total number of validated ('pass') circRNAs per count group and cell line —
# the denominator for the per-cell-line sensitivity computed below.
nr_val_cl <- cq %>%
  filter(compound_val == 'pass') %>%
  # unique set of validated circRNAs
  select(circ_id, cell_line, count_group_median) %>%
  distinct() %>%
  group_by(count_group_median, cell_line) %>%
  summarise(nr_expected = n())
## `summarise()` has grouped output by 'count_group_median'. You can override
## using the `.groups` argument.
nr_val_cl

then calculate sensitivity by dividing nr of circ found by total

# Per-tool sensitivity split by cell line: the number of validated circRNAs
# each tool detected, divided by the total validated in that cell line.
sens_cl <- cq %>%
  filter(compound_val == 'pass') %>%
  # unique set of validated circRNAs
  select(circ_id, cell_line, count_group_median) %>%
  distinct() %>%
  # which tools detected each of these circRNAs?
  left_join(all_circ %>%
              select(tool, circ_id, cell_line) %>%
              distinct()) %>%
  group_by(tool, count_group_median, cell_line) %>%
  summarise(nr_detected = n()) %>%
  left_join(nr_val_cl) %>%
  mutate(sensitivity = nr_detected / nr_expected) %>%
  ungroup()
## Joining with `by = join_by(circ_id, cell_line)`
## `summarise()` has grouped output by 'tool', 'count_group_median'. You can
## override using the `.groups` argument.
## Joining with `by = join_by(count_group_median, cell_line)`

then ANOVA

# Two-way ANOVA: does sensitivity (high-abundance group) differ between cell
# lines, accounting for tool?
sens_cl_anova <- sens_cl %>%
  filter(count_group_median == 'count ≥ 5') %>%
  select(tool, cell_line, sensitivity)

sens_cl_anova
sens_cl_anova$cell_line <- factor(sens_cl_anova$cell_line, levels = c('HLF', 'NCI-H23', 'SW480'))
sens_cl_anova$tool <- factor(sens_cl_anova$tool, levels = c("CIRCexplorer3", "CirComPara2", "circRNA_finder", "circseq_cup",  "CircSplice", "circtools","CIRI2", "CIRIquant", "ecircscreen","find_circ",  "KNIFE",  "NCLcomparator",  "NCLscan", "PFv2","Sailfish-cir", "segemehl"))

res.aov <- aov(sensitivity ~ cell_line + tool, data = sens_cl_anova)

summary(res.aov)
##             Df Sum Sq Mean Sq F value Pr(>F)    
## cell_line    2 0.0122 0.00609   4.048 0.0278 *  
## tool        15 1.6454 0.10969  72.866 <2e-16 ***
## Residuals   30 0.0452 0.00151                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Eta-squared (SS_effect / SS_total), taken from the fitted model itself.
# The previous hard-coded numbers (0.0036, 1.8085, 0.0427) were stale and did
# not match the ANOVA table above.
ss <- summary(res.aov)[[1]][["Sum Sq"]]
ss[1] / sum(ss)  # eta-squared: cell_line
ss[2] / sum(ss)  # eta-squared: tool

ANOVA - precision

recalculate the precision per cell line

# Recompute per-tool precision split by cell line: totals, fails, and passes
# for each validation method (RT-qPCR, RNase R, amplicon sequencing, compound),
# then the corresponding precision fractions.
val_cl = cq %>% 
  select(tool, circ_id, cell_line, count_group, qPCR_val, RR_val, RR_val_detail, amp_seq_val, amp_seq_val_detail, compound_val) %>%
  group_by(tool, count_group, cell_line) %>%
  summarise(nr_qPCR_total = n(),
            nr_qPCR_fail = sum(qPCR_val == 'fail'),
            nr_qPCR_val = sum(qPCR_val == 'pass'),
            nr_RR_total = n() - sum(is.na(RR_val)),  # NA = 'out_of_range', excluded from the RR denominator
            nr_RR_fail = sum(RR_val == "fail", na.rm = T),
            nr_RR_val = sum(RR_val == "pass", na.rm = T),
            nr_amp_total = n() - sum(is.na(amp_seq_val)),  # NA = 'not_included', excluded from the amp denominator
            nr_amp_fail = sum(amp_seq_val == "fail", na.rm = T),
            nr_amp_val = sum(amp_seq_val == "pass", na.rm = T),
            nr_compound_total = n() - sum(is.na(amp_seq_val)), # NOTE(review): denominator uses amp_seq_val, not compound_val — confirm intentional
            nr_compound_fail = sum(compound_val == "fail", na.rm = T),
            nr_compound_val = sum(compound_val == "pass", na.rm = T)) %>%
  # precision = passes / evaluable total, per validation method
  mutate(perc_qPCR_val = nr_qPCR_val/nr_qPCR_total,
         perc_RR_val = nr_RR_val/nr_RR_total,
         perc_amp_val = nr_amp_val/nr_amp_total,
         perc_compound_val = nr_compound_val/nr_compound_total) %>%
  ungroup()
## `summarise()` has grouped output by 'tool', 'count_group'. You can override
## using the `.groups` argument.

ANOVA

# Precision values for the high-abundance group, with cell line and tool as
# factors for the two-way ANOVAs below.
val_cl_anova <- val_cl %>%
  filter(count_group == 'count ≥ 5') %>%
  select(tool, cell_line, perc_qPCR_val, perc_RR_val, perc_amp_val, perc_compound_val)

val_cl_anova
val_cl_anova$cell_line <- factor(val_cl_anova$cell_line, levels = c('HLF', 'NCI-H23', 'SW480'))
val_cl_anova$tool <- factor(val_cl_anova$tool, levels = c("CIRCexplorer3", "CirComPara2", "circRNA_finder", "circseq_cup",  "CircSplice", "circtools","CIRI2", "CIRIquant", "ecircscreen","find_circ",  "KNIFE",  "NCLcomparator",  "NCLscan", "PFv2","Sailfish-cir", "segemehl"))

RT-qPCR precision

# Two-way ANOVA: RT-qPCR precision by cell line and tool
res.aov <- aov(perc_qPCR_val ~ cell_line + tool, data = val_cl_anova)
summary(res.aov)
##             Df   Sum Sq  Mean Sq F value  Pr(>F)   
## cell_line    2 0.008126 0.004063   6.586 0.00453 **
## tool        14 0.024932 0.001781   2.887 0.00821 **
## Residuals   28 0.017273 0.000617                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Eta-squared (SS_effect / SS_total), computed from the fitted model instead
# of hard-coded numbers so it always matches the table above
# (previous hard-coded values: 0.1614512 and 0.4953607).
ss <- summary(res.aov)[[1]][["Sum Sq"]]
ss[1] / sum(ss)  # eta-squared: cell_line
ss[2] / sum(ss)  # eta-squared: tool

RR precision

# Two-way ANOVA: RNase R precision by cell line and tool
res.aov <- aov(perc_RR_val ~ cell_line + tool, data = val_cl_anova)
summary(res.aov)
##             Df  Sum Sq  Mean Sq F value  Pr(>F)   
## cell_line    2 0.05260 0.026300   8.642 0.00119 **
## tool        14 0.16010 0.011436   3.758 0.00140 **
## Residuals   28 0.08521 0.003043                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Eta-squared (SS_effect / SS_total). The previous hard-coded calculation used
# the Mean Sq column (0.0263, 0.011436, 0.003043) instead of Sum Sq, so the
# reported effect sizes (0.645, 0.280) were wrong; computing from the fitted
# model fixes this.
ss <- summary(res.aov)[[1]][["Sum Sq"]]
ss[1] / sum(ss)  # eta-squared: cell_line
ss[2] / sum(ss)  # eta-squared: tool

ampl seq precision

# Two-way ANOVA: amplicon-sequencing precision by cell line and tool
res.aov <- aov(perc_amp_val ~ cell_line + tool, data = val_cl_anova)
summary(res.aov)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## cell_line    2 0.0187 0.00935   3.375   0.0486 *  
## tool        14 1.2629 0.09020  32.555 1.32e-13 ***
## Residuals   28 0.0776 0.00277                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Eta-squared (SS_effect / SS_total). The previous hard-coded calculation used
# Mean Sq values (0.00935, 0.09020) instead of Sum Sq — and mistyped the
# residual Mean Sq as 0.00279 instead of 0.00277 — so the reported effect
# sizes were wrong; computing from the fitted model fixes this.
ss <- summary(res.aov)[[1]][["Sum Sq"]]
ss[1] / sum(ss)  # eta-squared: cell_line
ss[2] / sum(ss)  # eta-squared: tool

compound precision

# Two-way ANOVA: compound validation precision by cell line and tool
res.aov <- aov(perc_compound_val ~ cell_line + tool, data = val_cl_anova)
summary(res.aov)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## cell_line    2 0.1250 0.06252   17.73 1.06e-05 ***
## tool        14 1.3220 0.09443   26.77 1.59e-12 ***
## Residuals   28 0.0988 0.00353                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Eta-squared (SS_effect / SS_total). The previous hard-coded calculation used
# the Mean Sq column (0.06252, 0.09443, 0.00353) instead of Sum Sq, so the
# reported effect sizes (0.390, 0.588) were wrong; computing from the fitted
# model fixes this.
ss <- summary(res.aov)[[1]][["Sum Sq"]]
ss[1] / sum(ss)  # eta-squared: cell_line
ss[2] / sum(ss)  # eta-squared: tool

Combination of tools

# Pairwise tool combinations: union/intersection sizes and precision values
# for every pair of tools
simple_union = read_tsv('../data/Supplementary_Table_7_combo_2tools.txt')
## Rows: 675 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool_1, tool_2, cell_line, count_group, tool_lt_1, tool_lt_2, tool_...
## dbl (9): nr_union, nr_intersection, perc_compound_val_1, perc_compound_val_2...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-tool annotation (same file as the sensitivity section above)
tool_annotation = read_tsv('../data/details/tool_annotation.txt')
## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Approach

# Does combining two tools with different detection approaches increase the
# number of detected circRNAs more than combining tools with the same approach?
union_test <- simple_union %>%
  filter(count_group == "count ≥ 5",
         tool_1 != tool_2) %>%
  # approach of the first tool in the pair
  left_join(tool_annotation %>%
              select(tool, approach) %>%
              rename(tool_1 = tool, approach_1 = approach)) %>%
  # approach of the second tool in the pair
  left_join(tool_annotation %>%
              select(tool, approach) %>%
              rename(tool_2 = tool, approach_2 = approach)) %>%
  mutate(approach = ifelse(approach_1 == approach_2, 'same', 'different')) %>%
  select(tool_1, tool_2, cell_line, perc_increase_t1, approach)
## Joining with `by = join_by(tool_1)`
## Joining with `by = join_by(tool_2)`
union_test
# summary statistics, then Wilcoxon test and effect size
union_test %>%
  group_by(approach) %>%
  summarise(median_increase_perc = median(perc_increase_t1),
            mean_increase_perc = mean(perc_increase_t1))
union_test %>% rstatix::wilcox_test(perc_increase_t1 ~ approach)
union_test %>% rstatix::wilcox_effsize(perc_increase_t1 ~ approach, ci = TRUE)

Linear annotation

# Does combining two tools with different linear-annotation strategies give a
# larger increase than combining tools with the same strategy?
union_test <- simple_union %>%
  filter(count_group == "count ≥ 5",
         tool_1 != tool_2) %>%
  # linear-annotation strategy of the first tool in the pair
  left_join(tool_annotation %>%
              select(tool, lin_annotation) %>%
              rename(tool_1 = tool, approach_1 = lin_annotation)) %>%
  # linear-annotation strategy of the second tool in the pair
  left_join(tool_annotation %>%
              select(tool, lin_annotation) %>%
              rename(tool_2 = tool, approach_2 = lin_annotation)) %>%
  mutate(approach = ifelse(approach_1 == approach_2, 'same', 'different')) %>%
  select(tool_1, tool_2, cell_line, perc_increase_t1, approach)
## Joining with `by = join_by(tool_1)`
## Joining with `by = join_by(tool_2)`
union_test
# summary statistics, then Wilcoxon test and effect size
union_test %>%
  group_by(approach) %>%
  summarise(median_increase_perc = median(perc_increase_t1),
            mean_increase_perc = mean(perc_increase_t1))
union_test %>% rstatix::wilcox_test(perc_increase_t1 ~ approach)
union_test %>% rstatix::wilcox_effsize(perc_increase_t1 ~ approach, ci = TRUE)

Strand annotation

# Does combining two tools with different strand-annotation methods give a
# larger increase than combining tools with the same method?
union_test <- simple_union %>%
  filter(count_group == "count ≥ 5",
         tool_1 != tool_2) %>%
  # strand-annotation method of the first tool in the pair
  left_join(tool_annotation %>%
              select(tool, strand_anno) %>%
              rename(tool_1 = tool, approach_1 = strand_anno)) %>%
  # strand-annotation method of the second tool in the pair
  left_join(tool_annotation %>%
              select(tool, strand_anno) %>%
              rename(tool_2 = tool, approach_2 = strand_anno)) %>%
  # tools that report no strand cannot be classified
  filter(approach_1 != "no strand reported",
         approach_2 != 'no strand reported') %>%
  mutate(approach = ifelse(approach_1 == approach_2, 'same', 'different')) %>%
  select(tool_1, tool_2, cell_line, perc_increase_t1, approach)
## Joining with `by = join_by(tool_1)`
## Joining with `by = join_by(tool_2)`
union_test
# summary statistics, then Wilcoxon test and effect size
union_test %>%
  group_by(approach) %>%
  summarise(median_increase_perc = median(perc_increase_t1),
            mean_increase_perc = mean(perc_increase_t1))
union_test %>% rstatix::wilcox_test(perc_increase_t1 ~ approach)
union_test %>% rstatix::wilcox_effsize(perc_increase_t1 ~ approach, ci = TRUE)

Splice sites

# Does combining two tools with different splice-site assumptions give a
# larger increase than combining tools with the same assumption?
union_test <- simple_union %>%
  filter(count_group == "count ≥ 5",
         tool_1 != tool_2) %>%
  # splice-site assumption of the first tool in the pair
  left_join(tool_annotation %>%
              select(tool, splicing) %>%
              rename(tool_1 = tool, approach_1 = splicing)) %>%
  # splice-site assumption of the second tool in the pair
  left_join(tool_annotation %>%
              select(tool, splicing) %>%
              rename(tool_2 = tool, approach_2 = splicing)) %>%
  mutate(approach = ifelse(approach_1 == approach_2, 'same', 'different')) %>%
  select(tool_1, tool_2, cell_line, perc_increase_t1, approach)
## Joining with `by = join_by(tool_1)`
## Joining with `by = join_by(tool_2)`
union_test
# summary statistics, then Wilcoxon test and effect size
union_test %>%
  group_by(approach) %>%
  summarise(median_increase_perc = median(perc_increase_t1),
            mean_increase_perc = mean(perc_increase_t1))
union_test %>% rstatix::wilcox_test(perc_increase_t1 ~ approach)
union_test %>% rstatix::wilcox_effsize(perc_increase_t1 ~ approach, ci = TRUE)

RNase R enrichment in sequencing data

# Per-circRNA counts before (UT) and after (T) RNase R treatment, with the
# derived enrichment classification
treatment = read_tsv('../data/Supplementary_Table_5_RNase_R_enrichment_seq.txt')
## Rows: 2303689 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group,...
## dbl (9): start, end, count_UT, count_T, nr_reads_UT, nr_reads_T, cpm_T, cpm_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Contingency table: count group x RNase R enrichment, across all tool/cell
# line detections that have counts in both conditions.
cont_table <- treatment %>%
  # Sailfish-cir reports no counts, so enrichment cannot be assessed
  filter(tool != 'Sailfish-cir') %>%
  # keep only circRNAs quantified in both untreated and treated samples
  filter(!is.na(count_UT), !is.na(count_T)) %>%
  # attach the median count group
  left_join(all_circ %>%
              select(circ_id_strand, count_group_median, cell_line) %>% unique()) %>%
  select(circ_id_strand, cell_line, tool, enrichment_bin, count_group_median) %>%
  group_by(count_group_median) %>%
  count(enrichment_bin) %>%
  pivot_wider(names_from = enrichment_bin, values_from = n) %>%
  ungroup() %>%
  select(-count_group_median)
## Joining with `by = join_by(cell_line, circ_id_strand)`
cont_table <- as.data.frame(cont_table)
# count groups sort "count < 5" before "count ≥ 5"
rownames(cont_table) <- c("count < 5", "count ≥ 5")

cont_table
# Visualize the contingency table as a mosaic plot
mosaicplot(cont_table,
           color = TRUE,
           main = "Mosaic plot")

# Expected cell counts — all very large, so the chi-squared test is valid
chisq.test(cont_table)$expected
##           enriched not enriched
## count < 5   302867     53352.05
## count ≥ 5   119751     21094.95
chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 1166.7, df = 1, p-value < 2.2e-16

OR

# Odds ratio of being "not enriched" for low-count relative to high-count
# circRNAs (rows: count < 5, count ≥ 5; columns: enriched, not enriched)
OR = (cont_table[1,2]/cont_table[1,1])/(cont_table[2,2]/cont_table[2,1])
OR
## [1] 1.37386